{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://mp.weixin.qq.com/s?__biz=MzI0NTQ1NDc4Nw==&mid=2247485451&idx=1&sn=eb7a785dbd8728ff2d16c72429d7f461&chksm=e94f0aa7de3883b10f51c8de23702ea1277f2d70eae64570c60711c89cf159bd5b7004dee9f4&scene=21#wechat_redirect"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "from gensim.corpora import Dictionary\n",
    "from gensim.models import ldamodel\n",
    "from gensim.matutils import kullback_leibler, jaccard, hellinger, sparse2full\n",
    "import numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "texts =  [\n",
    "  ['苹果','叶子','椭圆形','树上'],           \n",
    "  ['植物','叶子','绿色','落叶乔木'],           \n",
    "  ['水果','苹果','红彤彤','味道'],            \n",
    "  ['苹果','落叶乔木','树上','水果'],           \n",
    "  ['植物','营养','维生素'],           \n",
    "  ['营养','维生素','苹果','成分'],            \n",
    "  ['互联网','电脑','智能手机','高科技'],         \n",
    "  ['苹果','公司','互联网','品质'],          \n",
    "  ['乔布斯','苹果','硅谷'],          \n",
    "  ['电脑','智能手机','苹果','乔布斯'],         \n",
    "  ['苹果','电脑','品质','生意'],         \n",
    "  ['电脑','品质','乔布斯'],          \n",
    "  ['苹果','公司','生意','硅谷']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "dictionary = Dictionary(texts)\n",
    "corpus = [dictionary.doc2bow(text) for text in texts]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0,\n",
       "  '0.170*\"苹果\" + 0.087*\"电脑\" + 0.064*\"品质\" + 0.063*\"乔布斯\" + 0.053*\"智能手机\" + 0.052*\"水果\" + 0.051*\"互联网\" + 0.049*\"树上\" + 0.049*\"公司\" + 0.042*\"生意\"'),\n",
       " (1,\n",
       "  '0.093*\"苹果\" + 0.082*\"植物\" + 0.075*\"维生素\" + 0.074*\"营养\" + 0.060*\"叶子\" + 0.058*\"落叶乔木\" + 0.051*\"成分\" + 0.049*\"绿色\" + 0.048*\"硅谷\" + 0.045*\"乔布斯\"')]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "numpy.random.seed(2019) # 设置随机种子以获得每次相同的结果。\n",
    "model = ldamodel.LdaModel(corpus, id2word=dictionary, iterations=100, num_topics=2)\n",
    "model.show_topics(num_words=10) #展示主题模型，显示每个主题下的TOP10主题词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "22"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "model.num_terms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "doc1 = ['树上', '叶子', '植物']\n",
    "doc2 = ['乔布斯', '智能手机', '互联网']\n",
    "doc3 = [ '营养', '苹果','维生素']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "bow1 = model.id2word.doc2bow(doc1)  \n",
    "bow2 = model.id2word.doc2bow(doc2)   \n",
    "bow3 = model.id2word.doc2bow(doc3)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "lda_bow1 = model[bow1]\n",
    "lda_bow2 = model[bow2]\n",
    "lda_bow3 = model[bow3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.046444595428538994"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "hellinger(lda_bow1, lda_bow3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.4615085597971655"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hellinger(lda_bow1, lda_bow2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.505161552997494"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "hellinger(lda_bow2, lda_bow3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.505161552997494"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "hellinger(lda_bow2, lda_bow3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.008893967"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "kullback_leibler(lda_bow1, lda_bow3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0357668"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "kullback_leibler(lda_bow2, lda_bow3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.1175619"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "# 如您所见，两者的值不相等。我们稍后会介绍其中的细节\n",
    "kullback_leibler(lda_bow3, lda_bow2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.24727881), (1, 0.7527212)]\n",
      "[(0, 0.8584863), (1, 0.14151369)]\n",
      "[(0, 0.19269583), (1, 0.8073042)]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "print(model.get_document_topics(bow1))\n",
    "print(model.get_document_topics(bow2))\n",
    "print(model.get_document_topics(bow3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "jaccard(bow1, bow2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "jaccard(doc1, doc2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "jaccard(['苹果','大树','营养'], ['苹果','大树','营养'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "topic_公司, topic_水果 = model.show_topics() \n",
    "#一些预处理，以使距离度量以可接受的数据形式来获得主题\n",
    "def make_topics_bow(topic):\n",
    "# 获取由model.show_topics（）返回的字符串，分割字符串以便分别获取主题和概率    \n",
    "  topic = topic.split('+')#用于存储主题bows的list    \n",
    "  topic_bow = []\n",
    "  for word in topic:#分隔概率和词汇       \n",
    "   prob, word = word.split('*')#去掉空格       \n",
    "   word = word.replace(\" \",\"\")       \n",
    "   word = word.replace('\"','')#词汇表示转换\n",
    "   #print(word)       \n",
    "   word = model.id2word.doc2bow([word])[0][0]       \n",
    "   topic_bow.append((word, float(prob)))\n",
    "   return topic_bow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(3, 0.093)]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "公司_distribution = make_topics_bow(topic_公司[1])\n",
    "水果_distribution = make_topics_bow(topic_水果[1])\n",
    "# 以主题词汇加权表示的“水果”主题\n",
    "水果_distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(3, 0.093)]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "公司_distribution = make_topics_bow(topic_公司[1])\n",
    "水果_distribution = make_topics_bow(topic_水果[1])\n",
    "# 以主题词汇加权表示的“水果”主题\n",
    "水果_distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "ename": "IndexError",
     "evalue": "index 3 is out of bounds for axis 0 with size 1",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mIndexError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-25-4c0566349301>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mkullback_leibler\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m公司_distribution\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0m水果_distribution\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\gensim\\matutils.py\u001b[0m in \u001b[0;36mkullback_leibler\u001b[1;34m(vec1, vec2, num_features)\u001b[0m\n\u001b[0;32m    965\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    966\u001b[0m     \"\"\"\n\u001b[1;32m--> 967\u001b[1;33m     \u001b[0mvec1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvec2\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_convert_vec\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvec1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvec2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnum_features\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnum_features\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    968\u001b[0m     \u001b[1;32mreturn\u001b[0m \u001b[0mentropy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvec1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvec2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    969\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\gensim\\matutils.py\u001b[0m in \u001b[0;36m_convert_vec\u001b[1;34m(vec1, vec2, num_features)\u001b[0m\n\u001b[0;32m    933\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    934\u001b[0m             \u001b[0mmax_len\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvec1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvec2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 935\u001b[1;33m             \u001b[0mdense1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msparse2full\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvec1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmax_len\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    936\u001b[0m             \u001b[0mdense2\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msparse2full\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvec2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmax_len\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    937\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0mdense1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdense2\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\gensim\\matutils.py\u001b[0m in \u001b[0;36msparse2full\u001b[1;34m(doc, length)\u001b[0m\n\u001b[0;32m    401\u001b[0m     \u001b[0mdoc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    402\u001b[0m     \u001b[1;31m# overwrite some of the zeroes with explicit values\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 403\u001b[1;33m     \u001b[0mresult\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mitervalues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    404\u001b[0m     \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    405\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mIndexError\u001b[0m: index 3 is out of bounds for axis 0 with size 1"
     ]
    }
   ],
   "source": [
    "kullback_leibler(公司_distribution,水果_distribution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "topic1, topic2 = model.show_topics(num_words=len(model.id2word))\n",
    "#再次进行词袋表示转换\n",
    "公司_distribution = make_topics_bow(topic1[1])\n",
    "水果_distribution = make_topics_bow(topic2[1])\n",
    "# 返回kullback_leibler值\n",
    "kullback_leibler(水果_distribution, 公司_distribution,22)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.07590900821378677"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "# 常规的Hellinger\n",
    "hellinger(水果_distribution, 公司_distribution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.07590900821378677"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hellinger(公司_distribution, 水果_distribution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hellinger(水果_distribution, 水果_distribution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.4615085597971655"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hellinger(lda_bow1, lda_bow2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5079531552257045"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "hellinger(lda_bow1, lda_bow2) + hellinger(lda_bow1, lda_bow3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kullback_leibler(水果_distribution, 公司_distribution,22)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "kullback_leibler(公司_distribution, 水果_distribution,22)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
